#ifndef __SPLITTOOL_H__
#define __SPLITTOOL_H__

#include "tinyxml2.h" 
#include "WebPage.h"
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <unordered_set>
#include <iomanip>

using namespace tinyxml2;
using std::vector;
using std::string;
using std::ofstream;
using std::ifstream;
using std::unordered_set;
using std::cerr;


/**
 * @brief Splitting utility: cuts English article text into strings and turns
 *        XML input into WebPage records that can be saved as JSON.
 *
 * Only the declarations are visible in this header; the notes below are based
 * on the signatures and the original one-line comments. Details marked
 * "confirm" should be checked against the implementation file.
 */
class SplitTool
{
public:
    SplitTool();

    // Virtual because cut() is virtual: the class can serve as a polymorphic
    // base, and destroying a derived object through a SplitTool* would be
    // undefined behavior with a non-virtual destructor. The class already has
    // a vtable, so this adds no per-object cost.
    virtual ~SplitTool();

    /**
     * @brief Cuts an English article into pieces.
     * @param  (unnamed) the article text to cut.
     * @return the resulting strings (presumably one word/token per element
     *         -- confirm against the implementation).
     * @note   Declared virtual, so derived classes may override it.
     */
    virtual vector<string> cut(const string &);

    /**
     * @brief Splits an XML file into WebPage records.
     * @param  (unnamed) identifies the XML input (presumably a file path
     *         -- confirm against the implementation).
     * @return the WebPage objects produced by the split.
     */
    vector<WebPage> split_XML(const string&);

    /**
     * @brief Saves the pages obtained from splitting the XML in JSON format.
     * @param  (unnamed #1) the pages to save; taken by non-const reference.
     *         NOTE(review): whether the pages are modified is not visible here.
     * @param  (unnamed #2) presumably the output file name -- confirm.
     */
    void save_to_JSON(vector<WebPage>&, const string&);

private:
    /// Cleans invalid characters out of the given text and returns the
    /// remaining pieces as strings.
    vector<string> tokenize(const string &);

    /// Loads the stop words from @p filename and returns them as a set.
    unordered_set<string> StopWords(const string & filename);
};

#endif
